Realizar uma análise de sentimento. Para a análise de sentimento criar:

  1. word cloud,
  2. word cloud positiva,
  3. word cloud negativa,
  4. semantic network,
  5. dendrograma,
  6. word cloud bigram
# Reset the session before the analysis runs.
# NOTE(review): clearing the global environment and the console from inside a
# script is discouraged (it destroys the caller's session state and does not
# detach packages); kept as authored for this standalone analysis script.
rm(list = ls())
cat("\014")

# Install (if missing) and attach every package the analysis needs.
# The original repeated one if(!require(pkg)) {...} block per package;
# a vector plus a loop removes that duplication, and requireNamespace()
# avoids using require() (which only signals failure via its return value)
# for dependency loading.
required_packages <- c(
  "tidytext", "dplyr", "stringr", "tm", "tidyr", "gutenbergr",
  "scales", "ggplot2", "wordcloud", "reshape2", "igraph", "ggraph", "widyr"
)

for (pkg in required_packages) {
  # requireNamespace() checks availability without attaching the package
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # character.only = TRUE is required because pkg is a character string
  library(pkg, character.only = TRUE)
}

Leitura do Texto

Para o trabalho foram escolhidos textos relativos à Primeira Guerra Mundial (campanha de Gallipoli/Dardanelos, a França em 1914 e a intervenção aliada no norte da Rússia).

# Download the three corpora from Project Gutenberg by book id, keeping each
# book's title as metadata (requires network access; a mirror is auto-selected).
France <- gutenberg_download(c(17813, 16437, 45542, 18483,9975), meta_fields = "title")
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
Dardanelles <- gutenberg_download(c(11008, 11513, 15896, 3306, 35119), meta_fields = "title")
# Book 53482 was dropped from the Russia set (original call kept for reference)
#Russia <- gutenberg_download(c(46191, 10967, 22523, 10972, 53482), meta_fields = "title")
Russia <- gutenberg_download(c(46191, 10967, 22523, 10972), meta_fields = "title")


# Add a per-book line number to each downloaded corpus; linenumber is used
# later to index 80-line chunks for the sentiment trajectories.
# The identical pipeline was repeated verbatim for the three corpora, so it
# is factored into a single helper.
add_line_numbers <- function(books) {
  books %>%
    group_by(gutenberg_id) %>%
    mutate(linenumber = row_number()) %>%
    ungroup()
}

france_books      <- add_line_numbers(France)
dardanelles_books <- add_line_numbers(Dardanelles)
russia_books      <- add_line_numbers(Russia)

Separando as palavras contidas nos textos que estão sendo analisados e removendo as “stop words”

# Tokenize a corpus into one lower-cased word per row, keep only the leading
# [a-z'] run of each token, then drop stop words and NA tokens.
# Factored into a helper because the pipeline was triplicated verbatim.
tidy_corpus <- function(books) {
  books %>%
    unnest_tokens(input = text,
                  output = "word",
                  to_lower = TRUE,
                  drop = TRUE) %>%
    # str_extract() strips digits/punctuation; tokens with no [a-z'] match
    # become NA and are removed by drop_na() below
    mutate(word = str_extract(word, "[a-z']+")) %>%
    anti_join(stop_words, by = c("word" = "word")) %>%
    drop_na()
}

france_tidy      <- tidy_corpus(france_books)
dardanelles_tidy <- tidy_corpus(dardanelles_books)
russia_tidy      <- tidy_corpus(russia_books)

Contando as palavras para uma primeira análise da frequência:

# Ten most frequent words in each corpus (top_n(10) selects by the last
# column, n; ties can yield more than ten rows).
count(france_tidy, word, sort=TRUE) %>% top_n(10)
## # A tibble: 10 x 2
##    word        n
##    <chr>   <int>
##  1 french    599
##  2 war       589
##  3 day       518
##  4 german    441
##  5 time      422
##  6 paris     396
##  7 france    386
##  8 captain   248
##  9 germans   237
## 10 left      222
count(dardanelles_tidy, word, sort=TRUE) %>% top_n(10)
## # A tibble: 10 x 2
##    word      n
##    <chr> <int>
##  1 ken     699
##  2 time    444
##  3 roy     373
##  4 day     312
##  5 night   294
##  6 war     259
##  7 left    246
##  8 water   244
##  9 turks   239
## 10 front   217
count(russia_tidy, word, sort=TRUE) %>% top_n(10)
## # A tibble: 10 x 2
##    word          n
##    <chr>     <int>
##  1 russian     965
##  2 american    798
##  3 british     568
##  4 russia      520
##  5 company     480
##  6 archangel   457
##  7 time        434
##  8 front       415
##  9 soldiers    383
## 10 day         382

Analisando a frequência das palavras nos conjuntos de texto

# Frequency as per the book
# NOTE(review): this commented-out single-pipe version gathers the range
# `Personal Narratives`:`WWII Fiction`, which does not match the subjects
# actually used in this script (Dardanelles, France, Russia). The working
# step-by-step version follows below.
#frequency <- bind_rows(mutate(france_tidy, subject = "France"), 
#                       mutate(dardanelles_tidy, subject = "Dardanelles"),
#                       mutate(russia_tidy, subject = "Russia")) %>% 
#  mutate(word = str_extract(word, "[a-z']+")) %>%
#  count(subject, word) %>%
#  group_by(subject) %>%
#  mutate(proportion = n / sum(n)) %>% 
#  select(-n) %>% 
#  spread(subject, proportion) %>% 
#  gather(subject, proportion,`Personal Narratives`:`WWII Fiction`)

# Frequency step-by-step: compute each word's proportion within its subject,
# then reshape so Russia stays as its own column (used as the y axis) while
# Dardanelles and France are stacked into subject/proportion pairs (x axis,
# one facet per subject).
binded_texts <- bind_rows(mutate(france_tidy, subject = "France"), 
                          mutate(dardanelles_tidy, subject = "Dardanelles"),
                          mutate(russia_tidy, subject = "Russia"))

counted_texts <- count(binded_texts, subject, word)

grouped_texts <- group_by(counted_texts, subject)

# Proportion of each word within its subject's total word count
grouped_texts_by_proportion <- mutate(grouped_texts, proportion = n / sum(n))

# ungroup() before reshaping so the pivoted column is not a grouping variable
selected_texts <- ungroup(select(grouped_texts_by_proportion, -n))

# pivot_wider()/pivot_longer() replace the superseded spread()/gather()
spreaded_text <- pivot_wider(selected_texts,
                             names_from = subject,
                             values_from = proportion)

frequency <- pivot_longer(spreaded_text,
                          cols = Dardanelles:France,
                          names_to = "subject",
                          values_to = "proportion")

Gráfico de proporção das palavras dos livros sobre a França e Dardanelles vs. Russia:

# Scatter of word proportions: each subject (facet) against Russia, on
# log-log percent scales. Words near the diagonal are roughly equally
# frequent in both corpora; jitter/alpha reduce over-plotting.
ggplot(frequency, aes(x = proportion, y = `Russia`, color = abs(`Russia` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(~subject, ncol = 2) +
  theme(legend.position="none") +
  labs(y = "Russia", x = NULL)

Teste de Correlação entre os textos

# Pearson correlation between each subject's word proportions and Russia's
# (formula interface: ~ proportion + Russia on the subsetted rows).
cor.test(data = frequency[frequency$subject == "Dardanelles",],
         ~ proportion + `Russia`)
## 
##  Pearson's product-moment correlation
## 
## data:  proportion and Russia
## t = 51.042, df = 7470, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4915007 0.5251281
## sample estimates:
##       cor 
## 0.5085082
cor.test(data = frequency[frequency$subject == "France",],
         ~ proportion + `Russia`)
## 
##  Pearson's product-moment correlation
## 
## data:  proportion and Russia
## t = 65.76, df = 7805, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5826256 0.6111792
## sample estimates:
##       cor 
## 0.5970915

Análise de Sentimentos

Análise de sentimentos dos livros da França:

# Net sentiment trajectory of the France books: the bing lexicon labels each
# word positive/negative; counts are taken per 80-line chunk of each title.
france_sentiment <- france_tidy %>%
  # explicit join key (same rows; silences the implicit-join message)
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(title, index = linenumber %/% 80, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); chunks with no words of
  # one polarity get 0 instead of NA
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# One bar per 80-line chunk, one facet per book
ggplot(france_sentiment, aes(index, sentiment, fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~title, ncol = 2, scales = "free_x")

Análise de sentimentos dos livros de Dardanelles:

# Net sentiment trajectory of the Dardanelles books (bing lexicon, counts
# per 80-line chunk of each title).
dardanelles_sentiment <- dardanelles_tidy %>%
  # explicit join key (same rows; silences the implicit-join message)
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(title, index = linenumber %/% 80, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); missing polarity -> 0
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# One bar per 80-line chunk, one facet per book
ggplot(dardanelles_sentiment, aes(index, sentiment, fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~title, ncol = 2, scales = "free_x")

Análise de sentimentos dos livros da Rússia:

# Net sentiment trajectory of the Russia books (bing lexicon, counts per
# 80-line chunk of each title).
russia_sentiment <- russia_tidy %>%
  # explicit join key (same rows; silences the implicit-join message)
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(title, index = linenumber %/% 80, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); missing polarity -> 0
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# One bar per 80-line chunk, one facet per book
ggplot(russia_sentiment, aes(index, sentiment, fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~title, ncol = 2, scales = "free_x")

Nuvens de Palavras

Nuvem de palavras dos livros da França:

# Word cloud of the 100 most frequent words in the France books.
# The anti_join(stop_words) the original had here was redundant:
# france_tidy already had stop words removed during tokenization.
france_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

Nuvem de palavras dos livros de Dardanelles:

# Word cloud of the 100 most frequent words in the Dardanelles books.
# The anti_join(stop_words) the original had here was redundant:
# dardanelles_tidy already had stop words removed during tokenization.
dardanelles_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

Nuvem de palavras dos livros da Rússia:

# Word cloud of the 100 most frequent words in the Russia books.
# The anti_join(stop_words) the original had here was redundant:
# russia_tidy already had stop words removed during tokenization.
russia_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

Nuvem de palavras com divisão dos sentimentos dos livros da França:

# Comparison cloud: positive vs negative words (bing lexicon), France books.
france_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  # acast() builds the word x sentiment matrix comparison.cloud() expects
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 50)

Nuvem de palavras com divisão dos sentimentos dos livros de Dardanelles:

# Comparison cloud: positive vs negative words (bing lexicon), Dardanelles books.
dardanelles_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  # acast() builds the word x sentiment matrix comparison.cloud() expects
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 50)

Nuvem de palavras com divisão dos sentimentos dos livros da Rússia:

# Comparison cloud: positive vs negative words (bing lexicon), Russia books.
russia_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  # acast() builds the word x sentiment matrix comparison.cloud() expects
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 50)

Análise da frequência de palavras nos diferentes livros:

França:

# Word counts per France book title (tokens normalized, stop words removed;
# NA tokens are kept here and dropped later in the rank analysis).
france_book_words <- france_books %>%
  unnest_tokens(input = text,
                output = "word",
                to_lower = TRUE,
                drop = TRUE) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  anti_join(stop_words, by = c("word" = "word")) %>%
  count(title, word, sort = TRUE) %>%
  ungroup()

# Total words per book, used to turn counts into relative frequencies
france_total_words <- france_book_words %>% 
  group_by(title) %>% 
  summarize(total = sum(n))

# Explicit join key: same rows as before, without the implicit-by message
france_book_words <- left_join(france_book_words, france_total_words,
                               by = "title")

# Distribution of term frequency (n/total) per book — Zipf-like long tail
ggplot(france_book_words, aes(n/total, fill = title)) +
  geom_histogram(show.legend = FALSE, bins = 25) +
  xlim(NA, 0.0009) +
  facet_wrap(~title, ncol = 2, scales = "free_y")

Dardanelles:

# Word counts per Dardanelles book title (tokens normalized, stop words
# removed; NA tokens are kept here and dropped later in the rank analysis).
dardanelles_book_words <- dardanelles_books %>%
  unnest_tokens(input = text,
                output = "word",
                to_lower = TRUE,
                drop = TRUE) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  anti_join(stop_words, by = c("word" = "word")) %>%
  count(title, word, sort = TRUE) %>%
  ungroup()

# Total words per book, used to turn counts into relative frequencies
dardanelles_total_words <- dardanelles_book_words %>% 
  group_by(title) %>% 
  summarize(total = sum(n))

# Explicit join key: same rows as before, without the implicit-by message
dardanelles_book_words <- left_join(dardanelles_book_words,
                                    dardanelles_total_words,
                                    by = "title")

# Distribution of term frequency (n/total) per book — Zipf-like long tail
ggplot(dardanelles_book_words, aes(n/total, fill = title)) +
  geom_histogram(show.legend = FALSE, bins = 25) +
  xlim(NA, 0.0009) +
  facet_wrap(~title, ncol = 2, scales = "free_y")

Rússia

# Word counts per Russia book title (tokens normalized, stop words removed;
# NA tokens are kept here and dropped later in the rank analysis).
russia_book_words <- russia_books %>%
  unnest_tokens(input = text,
                output = "word",
                to_lower = TRUE,
                drop = TRUE) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  anti_join(stop_words, by = c("word" = "word")) %>%
  count(title, word, sort = TRUE) %>%
  ungroup()

# Total words per book, used to turn counts into relative frequencies
russia_total_words <- russia_book_words %>% 
  group_by(title) %>% 
  summarize(total = sum(n))

# Explicit join key: same rows as before, without the implicit-by message
russia_book_words <- left_join(russia_book_words, russia_total_words,
                               by = "title")

# Distribution of term frequency (n/total) per book — Zipf-like long tail
ggplot(russia_book_words, aes(n/total, fill = title)) +
  geom_histogram(show.legend = FALSE, bins = 25) +
  xlim(NA, 0.0009) +
  facet_wrap(~title, ncol = 2, scales = "free_y")

Análise da comparação da frequência dos termos nos livros:

França

# Zipf's-law check for the France books: rows are already sorted by n
# descending, so row_number() within each title is the frequency rank.
france_freq_by_rank <- france_book_words %>% 
  drop_na() %>%
  group_by(title) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)


# Fit the power-law exponent on the middle of the rank range only, where
# the log-log relation is approximately linear.
france_rank_subset <- france_freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

france_lm <- lm(log10(`term frequency`) ~ log10(rank), data = france_rank_subset)

# Log-log plot of term frequency vs rank with the fitted line overlaid
france_freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = title)) + 
  geom_abline(intercept = france_lm$coefficients[1], slope = france_lm$coefficients[2], color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

Dardanelles:

# Zipf's-law check for the Dardanelles books: rows are already sorted by n
# descending, so row_number() within each title is the frequency rank.
dardanelles_freq_by_rank <- dardanelles_book_words %>% 
  drop_na() %>%
  group_by(title) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

# Fit the power-law exponent on the middle of the rank range only
dardanelles_rank_subset <- dardanelles_freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

dardanelles_lm <- lm(log10(`term frequency`) ~ log10(rank), data = dardanelles_rank_subset)

# Log-log plot of term frequency vs rank with the fitted line overlaid
dardanelles_freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = title)) + 
  geom_abline(intercept = dardanelles_lm$coefficients[1], slope = dardanelles_lm$coefficients[2], color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

Rússia:

# Zipf's-law check for the Russia books: rows are already sorted by n
# descending, so row_number() within each title is the frequency rank.
russia_freq_by_rank <- russia_book_words %>% 
  drop_na() %>%
  group_by(title) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

# Fit the power-law exponent on the middle of the rank range only
russia_rank_subset <- russia_freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

russia_lm <- lm(log10(`term frequency`) ~ log10(rank), data = russia_rank_subset)

# Log-log plot of term frequency vs rank with the fitted line overlaid
russia_freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = title)) + 
  geom_abline(intercept = russia_lm$coefficients[1], slope = russia_lm$coefficients[2], color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

TF-IDF (Term Frequency - Inverse Document Frequency)

França

# tf-idf for the France books: words frequent in one title but rare in the
# others receive the highest weight.
france_book_words <- france_book_words %>%
  bind_tf_idf(word, title, n)

france_book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 25,599 x 6
##    title                               word        n      tf   idf  tf_idf
##    <chr>                               <chr>   <int>   <dbl> <dbl>   <dbl>
##  1 "The Children of France\r\nA Book … lad        63 0.00674 1.61  0.0108 
##  2 "The Children of France\r\nA Book … remi       45 0.00481 1.61  0.00775
##  3 At Ypres with Best-Dunkley          dunkley    90 0.00458 1.61  0.00736
##  4 "The Children of France\r\nA Book … mattia     30 0.00321 1.61  0.00517
##  5 At Ypres with Best-Dunkley          andrews    63 0.00320 1.61  0.00515
##  6 "The Children of France\r\nA Book … marie      50 0.00535 0.916 0.00490
##  7 At Ypres with Best-Dunkley          platoon    96 0.00488 0.916 0.00447
##  8 "Paris War Days\nDiary of an Ameri… centig…    48 0.00264 1.61  0.00425
##  9 "The Children of France\r\nA Book … prussi…    43 0.00460 0.916 0.00422
## 10 "Paris War Days\nDiary of an Ameri… paris     337 0.0185  0.223 0.00413
## # ... with 25,589 more rows
# Top-5 tf-idf words per title (top_n(5) selects by tf_idf, the last column)
france_book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(title) %>% 
  top_n(5) %>% 
  ungroup %>%
  ggplot(aes(word, tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~title, ncol = 2, scales = "free") +
  coord_flip()

Dardanelles

# tf-idf for the Dardanelles books: words frequent in one title but rare in
# the others receive the highest weight.
dardanelles_book_words <- dardanelles_book_words %>%
  bind_tf_idf(word, title, n)

dardanelles_book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 25,365 x 6
##    title                             word          n      tf   idf  tf_idf
##    <chr>                             <chr>     <int>   <dbl> <dbl>   <dbl>
##  1 On Land and Sea at the Dardanell… ken         698 0.0353  0.916 0.0324 
##  2 On Land and Sea at the Dardanell… roy         373 0.0189  1.61  0.0304 
##  3 "Five Months at Anzac\r\nA Narra… postage     129 0.0122  0.916 0.0112 
##  4 On Land and Sea at the Dardanell… dave         82 0.00415 1.61  0.00668
##  5 "At Suvla Bay\r\nBeing the notes… hawk         73 0.00656 0.916 0.00601
##  6 On Land and Sea at the Dardanell… ken's        73 0.00369 1.61  0.00594
##  7 On Land and Sea at the Dardanell… carringt…    60 0.00304 1.61  0.00489
##  8 "Five Months at Anzac\r\nA Narra… revised      31 0.00294 1.61  0.00473
##  9 "Trenching at Gallipoli\r\nThe p… dugout       40 0.00264 1.61  0.00426
## 10 "Trenching at Gallipoli\r\nThe p… newfound…    40 0.00264 1.61  0.00426
## # ... with 25,355 more rows
# Top-5 tf-idf words per title (top_n(5) selects by tf_idf, the last column)
dardanelles_book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(title) %>% 
  top_n(5) %>% 
  ungroup %>%
  ggplot(aes(word, tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~title, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf

Rússia:

# tf-idf for the Russia books: words frequent in one title but rare in the
# others receive the highest weight.
russia_book_words <- russia_book_words %>%
  bind_tf_idf(word, title, n)

russia_book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 22,784 x 6
##    title                               word        n      tf   idf  tf_idf
##    <chr>                               <chr>   <int>   <dbl> <dbl>   <dbl>
##  1 "With the \"Die-Hards\" in Siberia" omsk      116 0.00411 1.39  0.00570
##  2 "The History of the American Exped… pvt       228 0.00356 1.39  0.00494
##  3 "With the \"Die-Hards\" in Siberia" czech      92 0.00326 1.39  0.00452
##  4 "With the \"Die-Hards\" in Siberia" japane…   174 0.00617 0.693 0.00427
##  5 Fighting Without a War: An Account… british   111 0.0147  0.288 0.00421
##  6 "With the \"Die-Hards\" in Siberia" koltch…    77 0.00273 1.39  0.00378
##  7 Fighting Without a War: An Account… russia     89 0.0117  0.288 0.00338
##  8 Fighting Without a War: An Account… bolshe…    85 0.0112  0.288 0.00323
##  9 Four Weeks in the Trenches: The Wa… trench     22 0.00441 0.693 0.00305
## 10 Fighting Without a War: An Account… kitsa      28 0.00370 0.693 0.00256
## # ... with 22,774 more rows
# Top-5 tf-idf words per title (top_n(5) selects by tf_idf, the last column)
russia_book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(title) %>% 
  top_n(5) %>% 
  ungroup %>%
  ggplot(aes(word, tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~title, ncol = 2, scales = "free") +
  coord_flip()

Análise de Bigramas

França:

# Tokenize the France books into consecutive word pairs (bigrams)
france_bigrams <- france_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

france_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 97,692 x 2
##    bigram      n
##    <chr>   <int>
##  1 <NA>     5344
##  2 of the   2217
##  3 in the   1385
##  4 to the    907
##  5 on the    610
##  6 at the    464
##  7 and the   422
##  8 for the   404
##  9 to be     403
## 10 it was    381
## # ... with 97,682 more rows
# Split each bigram so both halves can be checked against the stop list
france_bigrams_separated <- france_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

france_bigrams_filtered <- france_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

france_bigram_counts <- france_bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

# Re-join the filtered halves for tf-idf over whole bigrams
france_bigrams_united <- france_bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

france_bigram_tf_idf <- france_bigrams_united %>%
  count(title, bigram) %>%
  bind_tf_idf(bigram, title, n) %>%
  arrange(desc(tf_idf))

# Top-5 bigrams by tf-idf per title
france_bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(title) %>% 
  top_n(5) %>% 
  ungroup %>%
  ggplot(aes(bigram, tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~title, ncol = 2, scales = "free") +
  coord_flip()

# Semantic network (word cloud bigram) of pairs seen at least 15 times
france_bigram_counts %>%
  drop_na() %>%
  filter(n >= 15) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "royalblue") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Dardanelles:

# Tokenize the Dardanelles books into consecutive word pairs (bigrams)
dardanelles_bigrams <- dardanelles_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

dardanelles_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 102,363 x 2
##    bigram       n
##    <chr>    <int>
##  1 <NA>      6449
##  2 of the    1783
##  3 in the    1143
##  4 to the     775
##  5 and the    671
##  6 on the     659
##  7 it was     627
##  8 in a       404
##  9 from the   401
## 10 at the     390
## # ... with 102,353 more rows
# Split each bigram so both halves can be checked against the stop list
dardanelles_bigrams_separated <- dardanelles_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

dardanelles_bigrams_filtered <- dardanelles_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

dardanelles_bigram_counts <- dardanelles_bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

# Re-join the filtered halves for tf-idf over whole bigrams
dardanelles_bigrams_united <- dardanelles_bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

dardanelles_bigram_tf_idf <- dardanelles_bigrams_united %>%
  count(title, bigram) %>%
  bind_tf_idf(bigram, title, n) %>%
  arrange(desc(tf_idf))

# Top-5 bigrams by tf-idf per title
dardanelles_bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(title) %>% 
  top_n(5) %>% 
  ungroup %>%
  ggplot(aes(bigram, tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~title, ncol = 2, scales = "free") +
  coord_flip()

# Semantic network (word cloud bigram) of pairs seen at least 15 times
dardanelles_bigram_counts %>%
  drop_na() %>%
  filter(n >= 15) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "royalblue") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Rússia

# Tokenize the Russia books into consecutive word pairs (bigrams)
russia_bigrams <- russia_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

russia_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 109,418 x 2
##    bigram       n
##    <chr>    <int>
##  1 <NA>      4220
##  2 of the    2776
##  3 in the    1371
##  4 to the    1081
##  5 on the     878
##  6 and the    691
##  7 it was     505
##  8 for the    497
##  9 by the     493
## 10 had been   444
## # ... with 109,408 more rows
# Split each bigram so both halves can be checked against the stop list
russia_bigrams_separated <- russia_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

russia_bigrams_filtered <- russia_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

russia_bigram_counts <- russia_bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

# Re-join the filtered halves for tf-idf over whole bigrams
russia_bigrams_united <- russia_bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

russia_bigram_tf_idf <- russia_bigrams_united %>%
  count(title, bigram) %>%
  bind_tf_idf(bigram, title, n) %>%
  arrange(desc(tf_idf))

# Top-5 bigrams by tf-idf per title
russia_bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(title) %>% 
  top_n(5) %>% 
  ungroup %>%
  ggplot(aes(bigram, tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~title, ncol = 2, scales = "free") +
  coord_flip()

# Semantic network (word cloud bigram) of pairs seen at least 20 times
# (higher threshold than the other corpora to keep the graph readable)
russia_bigram_counts %>%
  drop_na() %>%
  filter(n >= 20) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "royalblue") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Análise de Sentimentos dos Bigramas

França

# Bigrams whose first word is "not": how often negation precedes each word
france_bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 430 x 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   only     52
##  2 not   a        50
##  3 not   be       46
##  4 not   yet      32
##  5 not   been     30
##  6 not   know     29
##  7 not   to       27
##  8 not   the      25
##  9 not   get      17
## 10 not   have     15
## # ... with 420 more rows
# AFINN-scored words preceded by "not" (their sentiment is inverted in
# context). NOTE(review): newer tidytext releases renamed the AFINN column
# from `score` to `value` — confirm against the installed lexicon version.
france_not_words <- france_bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()

# Contribution = count * score; top 20 words by absolute contribution
france_not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

negation_words <- c("not", "no", "never", "without")

# AFINN-scored words preceded by ANY negation word, counted per pair.
# NOTE(review): newer tidytext releases renamed the AFINN column from
# `score` to `value` — confirm against the installed lexicon version.
france_negated_words <- france_bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()

france_negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  # Fixed copy-paste label: this plot covers all negation words, not just "not"
  xlab("Words preceded by a negation word") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

Dardanelles

# Bigrams whose first word is "not": how often negation precedes each word
dardanelles_bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 330 x 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   a        53
##  2 not   to       27
##  3 not   the      25
##  4 not   only     24
##  5 not   be       20
##  6 not   so       17
##  7 not   know     16
##  8 not   been     15
##  9 not   yet      15
## 10 not   get      14
## # ... with 320 more rows
# AFINN-scored words preceded by "not" (their sentiment is inverted in
# context). NOTE(review): newer tidytext releases renamed the AFINN column
# from `score` to `value` — confirm against the installed lexicon version.
dardanelles_not_words <- dardanelles_bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()

# Contribution = count * score; top 20 words by absolute contribution
dardanelles_not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

negation_words <- c("not", "no", "never", "without")

# AFINN-scored words preceded by ANY negation word, counted per pair.
# NOTE(review): newer tidytext releases renamed the AFINN column from
# `score` to `value` — confirm against the installed lexicon version.
dardanelles_negated_words <- dardanelles_bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()

dardanelles_negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  # Fixed copy-paste label: this plot covers all negation words, not just "not"
  xlab("Words preceded by a negation word") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

Rússia:

# Bigrams whose first word is "not": how often negation precedes each word
russia_bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 411 x 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   be       40
##  2 not   a        38
##  3 not   to       35
##  4 not   the      32
##  5 not   know     24
##  6 not   only     23
##  7 not   been     22
##  8 not   have     19
##  9 not   one      18
## 10 not   see      16
## # ... with 401 more rows
# AFINN-scored words preceded by "not" (their sentiment is inverted in
# context). NOTE(review): newer tidytext releases renamed the AFINN column
# from `score` to `value` — confirm against the installed lexicon version.
russia_not_words <- russia_bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()

# Contribution = count * score; top 20 words by absolute contribution
russia_not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

negation_words <- c("not", "no", "never", "without")

# AFINN-scored words preceded by ANY negation word, counted per pair.
# NOTE(review): newer tidytext releases renamed the AFINN column from
# `score` to `value` — confirm against the installed lexicon version.
russia_negated_words <- russia_bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()

russia_negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  # Fixed copy-paste label: this plot covers all negation words, not just "not"
  xlab("Words preceded by a negation word") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

Impressão de grafos dos bigramas:

França

# Directed graph of France bigrams occurring more than 10 times
france_bigram_graph <- france_bigram_counts %>%
  drop_na() %>%
  filter(n > 10) %>%
  graph_from_data_frame()

france_bigram_graph
## IGRAPH a402b8f DN-- 79 50 -- 
## + attr: name (v/c), n (e/n)
## + edges from a402b8f (vertex names):
##  [1] red       ->cross      captain   ->andrews    degrees   ->centigrade
##  [4] major     ->brighten   alsace    ->lorraine   machine   ->gun       
##  [7] sergeant  ->major      lancashire->fusiliers  front     ->line      
## [10] herr      ->von        lance     ->corporal   sergeant  ->baldwin   
## [13] french    ->army       machine   ->guns       2         ->5th       
## [16] foreign   ->affairs    french    ->soldiers   bilge     ->trench    
## [19] german    ->army       de        ->la         rue       ->de        
## [22] war       ->office     le        ->capitaine  military  ->governor  
## + ... omitted several edges
# Fixed seed makes the force-directed ("fr") layout reproducible
set.seed(2018)

ggraph(france_bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

# Same graph, with arrowed edges whose transparency reflects the pair count
ggraph(france_bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

Dardanelles:

# Directed graph of Dardanelles bigrams occurring more than 10 times
dardanelles_bigram_graph <- dardanelles_bigram_counts %>%
  drop_na() %>%
  filter(n > 10) %>%
  graph_from_data_frame()

dardanelles_bigram_graph
## IGRAPH a670099 DN-- 78 57 -- 
## + attr: name (v/c), n (e/n)
## + edges from a670099 (vertex names):
##  [1] _postage ->1d         firing   ->line       6d       ->_postage  
##  [4] 3s       ->6d         hundred  ->yards      stretcher->bearers   
##  [7] _postage ->2d         salt     ->lake       cloth    ->gilt      
## [10] machine  ->gun        ken      ->answered   red      ->cross     
## [13] suvla    ->bay        bully    ->beef       gilt     ->gilt      
## [16] top      ->3s         6s       ->_postage   dressing ->station   
## [19] gilt     ->top        edges    ->6s         hospital ->ship      
## [22] morocco  ->gilt       barbed   ->wire       captain  ->carrington
## + ... omitted several edges
# Fixed seed makes the force-directed ("fr") layout reproducible
set.seed(2018)

ggraph(dardanelles_bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

# Same graph, with arrowed edges whose transparency reflects the pair count
ggraph(dardanelles_bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

Rússia:

russia_bigram_graph <- russia_bigram_counts %>%
  drop_na() %>%
  filter(n > 10) %>%
  graph_from_data_frame()

russia_bigram_graph
## IGRAPH 88c7121 DN-- 138 117 -- 
## + attr: name (v/c), n (e/n)
## + edges from 88c7121 (vertex names):
##  [1] north     ->russia   machine   ->gun      339th     ->inf     
##  [4] red       ->cross    machine   ->guns     american  ->soldiers
##  [7] official  ->photo    north     ->russian  american  ->troops  
## [10] ust       ->padenga  red       ->guards   admiral   ->koltchak
## [13] russian   ->people   british   ->officer  russian   ->army    
## [16] british   ->officers commanding->officer  dvina     ->river   
## [19] american  ->officer  american  ->soldier  supreme   ->governor
## [22] 310th     ->engrs   
## + ... omitted several edges
# Reset the seed so the Russia network layout is reproducible as well.
set.seed(2018)

# Basic semantic network for the Russia bigrams.
ggraph(russia_bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

# Closed arrow head for the directed version of the plot.
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

# Directed version: edge transparency maps to the bigram frequency n.
ggraph(russia_bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

Correlação dos Bigramas

França:

# Split the France books into 10-line "sections" (dropping section 0),
# tokenize each line into words, keep only the leading run of letters or
# apostrophes from each token (tokens that start with a digit yield NA
# and are removed downstream), and filter out common stop words.
france_section_words <- france_books %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  mutate(word=str_extract(word,"[a-z']+")) %>%
  filter(!word %in% stop_words$word)

france_section_words
## # A tibble: 83,918 x 5
##    gutenberg_id title                           linenumber section word   
##           <int> <chr>                                <int>   <dbl> <chr>  
##  1         9975 "Paris War Days\nDiary of an A…         11       1 diary  
##  2         9975 "Paris War Days\nDiary of an A…         11       1 americ…
##  3         9975 "Paris War Days\nDiary of an A…         19       1 charles
##  4         9975 "Paris War Days\nDiary of an A…         19       1 inman  
##  5         9975 "Paris War Days\nDiary of an A…         19       1 barnard
##  6         9975 "Paris War Days\nDiary of an A…         19       1 ll     
##  7         9975 "Paris War Days\nDiary of an A…         19       1 harvard
##  8         9975 "Paris War Days\nDiary of an A…         21       2 knight 
##  9         9975 "Paris War Days\nDiary of an A…         21       2 legion 
## 10         9975 "Paris War Days\nDiary of an A…         21       2 honor  
## # ... with 83,908 more rows
# Count how often each pair of words co-occurs within the same 10-line
# section of the France corpus, most frequent pairs first; rows with NA
# (tokens the earlier str_extract() failed to match) are removed.
france_word_pairs <-
  drop_na(pairwise_count(france_section_words, word, section, sort = TRUE))

print(france_word_pairs)
## # A tibble: 2,042,368 x 3
##    item1  item2      n
##    <chr>  <chr>  <dbl>
##  1 french german   119
##  2 german french   119
##  3 french war      105
##  4 war    french   105
##  5 day    war      104
##  6 war    day      104
##  7 france war       92
##  8 war    france    92
##  9 german war       89
## 10 war    german    89
## # ... with 2,042,358 more rows
# Pairwise correlation of word co-occurrence across sections, restricted
# to words appearing at least 20 times so rare words do not produce
# spurious high correlations; NA tokens are removed first.
france_word_cors <- france_section_words %>%
  na.omit() %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)

france_word_cors
## # A tibble: 652,056 x 3
##    item1       item2       correlation
##    <chr>       <chr>             <dbl>
##  1 centigrade  degrees           0.906
##  2 degrees     centigrade        0.906
##  3 lancashire  fusiliers         0.820
##  4 fusiliers   lancashire        0.820
##  5 corporal    lance             0.806
##  6 lance       corporal          0.806
##  7 lorraine    alsace            0.773
##  8 alsace      lorraine          0.773
##  9 centigrade  thermometer       0.704
## 10 thermometer centigrade        0.704
## # ... with 652,046 more rows
# Bar chart of the six words most correlated with each of four selected
# words from the France corpus, one facet per word, flipped so the bars
# read horizontally.
france_word_cors %>%
  filter(item1 %in% c("lorraine", "lancashire", "fusiliers", "von")) %>%
  group_by(item1) %>%
  # slice_max() replaces the superseded top_n() and names the ranking
  # column explicitly instead of relying on the last column.
  slice_max(correlation, n = 6) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()

# Seed again so the correlation-network layout below is reproducible.
set.seed(2018)

# Correlation network for France: connect word pairs whose pairwise
# correlation across sections exceeds 0.25; edge transparency encodes
# correlation strength.
france_word_cors %>%
  filter(correlation > .25) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

# Alternative rendering of the same network: both edge width and edge
# transparency encode the correlation, edges drawn in "cyan4", and the
# legend is left visible (show.legend not suppressed here).
france_word_cors %>%
  filter(correlation > .25) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "cyan4") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, 
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Dardanelles:

# Same sectioning/tokenization pipeline as for France, applied to the
# Dardanelles books: 10-line sections, word tokens reduced to their
# leading alphabetic part (NA for non-matching tokens), stop words removed.
dardanelles_section_words <- dardanelles_books %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  mutate(word=str_extract(word,"[a-z']+")) %>%
  filter(!word %in% stop_words$word)

dardanelles_section_words
## # A tibble: 86,828 x 5
##    gutenberg_id title                           linenumber section word   
##           <int> <chr>                                <int>   <dbl> <chr>  
##  1         3306 "At Suvla Bay\r\nBeing the not…         11       1 serving
##  2         3306 "At Suvla Bay\r\nBeing the not…         11       1 field  
##  3         3306 "At Suvla Bay\r\nBeing the not…         11       1 ambula…
##  4         3306 "At Suvla Bay\r\nBeing the not…         11       1 divisi…
##  5         3306 "At Suvla Bay\r\nBeing the not…         11       1 medite…
##  6         3306 "At Suvla Bay\r\nBeing the not…         12       1 expedi…
##  7         3306 "At Suvla Bay\r\nBeing the not…         12       1 force  
##  8         3306 "At Suvla Bay\r\nBeing the not…         12       1 war    
##  9         3306 "At Suvla Bay\r\nBeing the not…         18       1 minobi 
## 10         3306 "At Suvla Bay\r\nBeing the not…         20       2 played 
## # ... with 86,818 more rows
# Co-occurrence counts for the Dardanelles corpus: pairs of words that
# share a 10-line section, sorted by frequency, with NA tokens removed.
dardanelles_word_pairs <-
  drop_na(pairwise_count(dardanelles_section_words, word, section, sort = TRUE))

print(dardanelles_word_pairs)
## # A tibble: 2,169,572 x 3
##    item1    item2        n
##    <chr>    <chr>    <dbl>
##  1 roy      ken        215
##  2 ken      roy        215
##  3 ken      time        83
##  4 time     ken         83
##  5 ken      answered    80
##  6 answered ken         80
##  7 postage  cloth       76
##  8 cloth    postage     76
##  9 ken      moment      67
## 10 moment   ken         67
## # ... with 2,169,562 more rows
# Pairwise correlation of word co-occurrence across Dardanelles sections,
# limited to words with at least 20 occurrences; NA tokens removed first.
dardanelles_word_cors <- dardanelles_section_words %>%
  na.omit() %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)

dardanelles_word_cors
## # A tibble: 818,120 x 3
##    item1   item2   correlation
##    <chr>   <chr>         <dbl>
##  1 morocco gilt          0.905
##  2 gilt    morocco       0.905
##  3 edges   morocco       0.904
##  4 morocco edges         0.904
##  5 lake    salt          0.876
##  6 salt    lake          0.876
##  7 edges   gilt          0.859
##  8 gilt    edges         0.859
##  9 beef    bully         0.836
## 10 bully   beef          0.836
## # ... with 818,110 more rows
# Bar chart of the six words most correlated with each of four selected
# words from the Dardanelles corpus, faceted per word.
dardanelles_word_cors %>%
  filter(item1 %in% c("edges", "morocco", "jhill", "gilt")) %>%
  group_by(item1) %>%
  # slice_max() replaces the superseded top_n() and names the ranking
  # column explicitly instead of relying on the last column.
  slice_max(correlation, n = 6) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()

# Seed for a reproducible network layout.
set.seed(2018)

# Correlation network for the Dardanelles corpus (pairs above 0.25).
# NOTE(review): unlike the France/Russia versions, this plot also maps
# edge_width to correlation — possibly intentional, worth confirming.
dardanelles_word_cors %>%
  filter(correlation > .25) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

# Alternative rendering: width and transparency both encode correlation,
# edges in "cyan4", legend left visible.
dardanelles_word_cors %>%
  filter(correlation > .25) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "cyan4") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, 
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Rússia:

# Same sectioning/tokenization pipeline applied to the Russia books:
# 10-line sections, tokens reduced to their leading alphabetic part
# (NA for non-matching tokens), stop words removed.
russia_section_words <- russia_books %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  mutate(word=str_extract(word,"[a-z']+")) %>%
  filter(!word %in% stop_words$word)

russia_section_words
## # A tibble: 104,754 x 5
##    gutenberg_id title                           linenumber section word   
##           <int> <chr>                                <int>   <dbl> <chr>  
##  1        10967 Four Weeks in the Trenches: Th…         10       1 life   
##  2        10967 Four Weeks in the Trenches: Th…         10       1 dedica…
##  3        10967 Four Weeks in the Trenches: Th…         10       1 book   
##  4        10967 Four Weeks in the Trenches: Th…         12       1 humble 
##  5        10967 Four Weeks in the Trenches: Th…         12       1 token  
##  6        10967 Four Weeks in the Trenches: Th…         12       1 everla…
##  7        10967 Four Weeks in the Trenches: Th…         12       1 gratit…
##  8        10967 Four Weeks in the Trenches: Th…         12       1 devoti…
##  9        10967 Four Weeks in the Trenches: Th…         17       1 preface
## 10        10967 Four Weeks in the Trenches: Th…         19       1 record 
## # ... with 104,744 more rows
# Co-occurrence counts for the Russia corpus: pairs of words sharing a
# 10-line section, sorted by frequency, with NA tokens removed.
russia_word_pairs <-
  drop_na(pairwise_count(russia_section_words, word, section, sort = TRUE))

print(russia_word_pairs)
## # A tibble: 2,576,030 x 3
##    item1     item2         n
##    <chr>     <chr>     <dbl>
##  1 russian   american    167
##  2 american  russian     167
##  3 british   american    146
##  4 american  british     146
##  5 british   russian     141
##  6 north     russia      141
##  7 russia    north       141
##  8 russian   british     141
##  9 archangel american    138
## 10 american  archangel   138
## # ... with 2,576,020 more rows
# Pairwise correlation of word co-occurrence across Russia sections,
# limited to words with at least 20 occurrences; NA tokens removed first.
russia_word_cors <- russia_section_words %>%
  na.omit() %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)

russia_word_cors
## # A tibble: 1,097,256 x 3
##    item1        item2        correlation
##    <chr>        <chr>              <dbl>
##  1 padenga      ust                0.910
##  2 ust          padenga            0.910
##  3 pvt          mich               0.734
##  4 mich         pvt                0.734
##  5 photo        official           0.724
##  6 official     photo              0.724
##  7 photo        illustration       0.723
##  8 illustration photo              0.723
##  9 governor     supreme            0.686
## 10 supreme      governor           0.686
## # ... with 1,097,246 more rows
# Bar chart of the six words most correlated with each of four selected
# words from the Russia corpus, faceted per word.
russia_word_cors %>%
  filter(item1 %in% c("padenga", "revolutionary", "comrade", "lieut")) %>%
  group_by(item1) %>%
  # slice_max() replaces the superseded top_n() and names the ranking
  # column explicitly instead of relying on the last column.
  slice_max(correlation, n = 6) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()

# Seed for a reproducible network layout.
set.seed(2018)

# Correlation network for Russia: word pairs with correlation above 0.25;
# edge transparency encodes correlation strength.
russia_word_cors %>%
  filter(correlation > .25) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

# Alternative rendering: width and transparency both encode correlation,
# edges in "cyan4", legend left visible.
russia_word_cors %>%
  filter(correlation > .25) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "cyan4") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, 
                 point.padding = unit(0.2, "lines")) +
  theme_void()